# Rolling average of daily positive cases for health board "Scotland"
trend_hb_daily_roll_avg <- calculate_roll_avg("positive")
# Convert to a time series so the plotly range slider/selector can be used
trend_hb_daily_roll_avg_ts <- convert_to_timeseries(trend_hb_daily_roll_avg)
# Build the chart in a single pipeline: bars for daily counts, a line for
# the 7-day rolling average, then the shared layout (title, axes, legend)
trend_positive_plot <- trend_hb_daily_roll_avg_ts %>%
  plot_ly(
    x = ~Date,
    hoverinfo ="text",
    text = ~paste("Date: ", Date,"<br>",
                  "Daily Positive: ", daily_positive,"<br>",
                  "Seven Day Average: ",seven_day_roll_avg)
  ) %>%
  add_bars(y = ~daily_positive,
           name = "Daily Positive",
           color = I("#5ab4ac")) %>%
  add_lines(y = ~seven_day_roll_avg,
            name = "Seven Day Rolling Average",
            color = I("red")) %>%
  layout(
    title = list(text ="Trend on Positive Cases"),
    xaxis = xaxis_list,
    yaxis = yaxis_list,
    legend = list(title = list(text = '<b> Trend </b>'),
                  x = 0.1, y = 0.9)
  )
trend_positive_plot
NA
Data for the report (Presentation)
# Latest day's cumulative positive-case figures for Scotland (report data)
total_positive_case <- trend_hb_daily %>%
  filter(hb_name == "Scotland") %>%
  filter(date == max(date))

# Positive cases over the latest 7 days.
# Fixed off-by-one: `date >= max(date) - 7` spanned 8 calendar days;
# `>` yields a true 7-day window (max-6 .. max).
recent_7days <- calculate_roll_avg("positive") %>%
  filter(date > max(date) - 7) %>%
  summarise(total = sum(daily_positive))

# Positive cases over the 7 days before that (max-13 .. max-7), made
# contiguous and the same width as the recent window.
previous_7days <- calculate_roll_avg("positive") %>%
  filter(date > max(date) - 14) %>%
  filter(date <= max(date) - 7) %>%
  summarise(total = sum(daily_positive))

# Week-on-week percentage change (negative = decrease)
percentage_calc <- (recent_7days - previous_7days) / previous_7days * 100
percentage_calc
# decrease of 13.04 % — NOTE(review): value came from the original 8-day
# window; rerun after the window fix to refresh it
# Total hospital admissions for Scotland over the whole series
total_hospital <- trend_hb_daily %>%
  filter(hb_name == "Scotland") %>%
  summarise(total = sum(hospital_admissions, na.rm = TRUE))

# Admissions in the latest 7 days; rows without a rolling average are
# dropped first so the window is anchored on complete data.
# Fixed off-by-one: `>=` spanned 8 days, `>` gives a 7-day window.
recent_7days_hospital <- calculate_roll_avg("hospital") %>%
  filter(!is.na(seven_day_roll_avg)) %>%
  filter(date > max(date) - 7) %>%
  summarise(total = sum(hospital_admissions))

# Admissions in the 7 days before that (same-width, contiguous window)
previous_7days_hospital <- calculate_roll_avg("hospital") %>%
  filter(!is.na(seven_day_roll_avg)) %>%
  filter(date > max(date) - 14) %>%
  filter(date <= max(date) - 7) %>%
  summarise(total = sum(hospital_admissions))

# Week-on-week percentage change (negative = decrease)
percentage_calc_hosp <- (recent_7days_hospital - previous_7days_hospital) /
  previous_7days_hospital * 100
percentage_calc_hosp
# decrease of 15.11 % — NOTE(review): value from the original 8-day window
# Total ICU admissions for Scotland over the whole series
icu_hospital <- trend_hb_daily %>%
  filter(hb_name == "Scotland") %>%
  summarise(total = sum(icu_admissions, na.rm = TRUE))

# Cumulative deaths on Scotland's most recent reporting date
cum_deaths <- trend_hb_daily %>%
  filter(hb_name == "Scotland") %>%
  filter(date == max(date)) %>%
  select(cumulative_deaths)

# Latest Dose 1 row for Scotland (all ages, both sexes combined)
daily_vacc_hb %>%
  filter(hb_name == "Scotland",
         sex == "Total",
         age_group == "All vaccinations",
         dose == "Dose 1") %>%
  filter(date == max(date))
# Calculate rolling average of hospital admissions for health board "Scotland"
trend_hb_hosp_roll_avg <- calculate_roll_avg("hospital")
# Convert the data frame to a time series to use the range slider and selector
trend_hb_hosp_roll_avg_ts <- convert_to_timeseries(trend_hb_hosp_roll_avg)
# Create the plot using plot_ly; hover text shows date, admissions, 7-day average
trend_hospital_plot <- plot_ly(trend_hb_hosp_roll_avg_ts, x = ~Date,
hoverinfo ="text",
text = ~paste("Date: ", Date,"<br>",
"Patients admitted: ", hospital_admissions,"<br>",
"Seven Day Average: ",seven_day_roll_avg))
# Add bar chart for daily hospital admissions
# (comment fixed: previously said "daily positive")
trend_hospital_plot <- trend_hospital_plot %>% add_bars(y = ~hospital_admissions,
name = "Patients admitted",
color = I("#5ab4ac")
)
# Add line for the seven-day rolling average
trend_hospital_plot <- trend_hospital_plot %>% add_lines(y = ~seven_day_roll_avg,
name = "Seven Day Rolling Average",
color = I("red"))
# Add layout (title, shared axis config, legend position) to the existing plot
trend_hospital_plot <- trend_hospital_plot %>% layout(
title = list(text ="Hospitalisation"),
xaxis = xaxis_list,
yaxis = yaxis_list,
legend = list(title=list(text='<b> Trend </b>'),
x = 0.1, y = 0.9))
trend_hospital_plot
Warning: Ignoring 2 observations
Warning: Ignoring 2 observations
# Calculate rolling average of daily deaths for health board "Scotland"
trend_hb_death_roll_avg <- calculate_roll_avg("death")
# Convert the data frame to a time series to use the range slider and selector
trend_hb_death_roll_avg_ts <- convert_to_timeseries(trend_hb_death_roll_avg)
# Create the plot using plot_ly; hover text shows date, deaths, 7-day average
trend_death_plot <- plot_ly(trend_hb_death_roll_avg_ts, x = ~Date,
hoverinfo ="text",
text = ~paste("Date: ", Date,"<br>",
"Daily Deaths: ", daily_deaths,"<br>",
"Seven Day Average: ",seven_day_roll_avg))
# Add bar chart for daily deaths
# (comment fixed: previously said "daily positive")
trend_death_plot <- trend_death_plot %>% add_bars(y = ~daily_deaths,
name = "Daily Deaths",
color = I("#5ab4ac")
)
# Add line for the seven-day rolling average
trend_death_plot <- trend_death_plot %>% add_lines(y = ~seven_day_roll_avg,
name = "Seven Day Rolling Average",
color = I("red"))
# Add layout (title, shared axis config, legend position) to the existing plot
trend_death_plot <- trend_death_plot %>% layout(
title = list(text ="Deaths reported"),
xaxis = xaxis_list,
yaxis = yaxis_list,
legend = list(title=list(text='<b> Trend </b>'),
x = 0.1, y = 0.9))
trend_death_plot
NA
“PHS_COVID Vaccination Prediction Model Using ARIMA”
# Vaccination trend for Scotland: all ages, sexes combined, and only rows
# where vaccination has actually started (non-zero cumulative count)
trend_vacc_hb <- daily_vacc_hb %>%
  filter(hb_name == "Scotland") %>%
  filter(sex == "Total") %>%
  filter(age_group == "All vaccinations") %>%
  filter(cumulative_number_vaccinated != 0)

# Plot the daily vaccination trend, one line per dose
plot_vaccine <- trend_vacc_hb %>%
  ggplot() +
  aes(x = date, y = number_vaccinated) +
  geom_line(aes(color = dose)) +
  scale_x_date(breaks = "1 month", date_labels = "%b - %y") +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) +
  ggtitle("People Vaccinated") +
  xlab("Year") +
  # Fixed axis label: this chart shows vaccinations, not positive cases
  ylab("No of People Vaccinated") +
  color_theme() +
  scale_colour_manual(values = c("#f1a340", "#5ab4ac"))
ggplotly(plot_vaccine)
# Eligible (16+) population for Scotland on the latest reporting date;
# used as the denominator for the coverage percentage axis below.
# The max(date) filter is kept in its original position so the latest
# date is still determined among sex == "Total" rows only.
dose_population <- daily_vacc_hb %>%
  filter(sex == "Total") %>%
  filter(date == max(date)) %>%
  filter(hb_name == "Scotland", age_group == "16 years and over") %>%
  distinct(population)
# Plot the cumulative vaccination trend, with a secondary axis expressing
# the counts as a percentage of the 16+ population (dose_population).
plot_vaccine_cumm <- trend_vacc_hb %>%
  ggplot() +
  aes(x = date, y = cumulative_number_vaccinated) +
  geom_line(aes(color = dose)) +
  scale_x_date(breaks = "1 month", date_labels = "%b - %y") +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) +
  # Fixed typo in title: "Cummulative" -> "Cumulative"
  ggtitle("Cumulative Trend on Vaccination") +
  xlab("Year") +
  ylab("No of People Vaccinated") +
  color_theme() +
  scale_colour_manual(values = c("#f1a340", "#5ab4ac")) +
  scale_y_continuous(
    labels = scales::unit_format(unit = "M", scale = 1e-6),
    # Secondary axis rescales counts to a share of the eligible population
    sec.axis = sec_axis(trans = ~ . / dose_population$population,
                        name = "Percentage",
                        labels = scales::label_percent(accuracy = 0.01))
  )
plotly::ggplotly(plot_vaccine_cumm)
NA
# Latest cumulative coverage per age group for Scotland (qualifier rows
# excluded). Coverage above 100% is capped at 100; other values are
# rounded to 2 dp. Filter order is preserved so max(date) is evaluated
# on the same subset as before.
cumm_vac_age <- daily_vacc_hb %>%
  filter(sex == "Total") %>%
  filter(is.na(age_group_qf)) %>%
  filter(date == max(date)) %>%
  filter(hb_name == "Scotland", age_group != "All vaccinations") %>%
  mutate(cumulative_percent_coverage = ifelse(
    cumulative_percent_coverage > 100,
    100,
    round(cumulative_percent_coverage, 2)
  )) %>%
  select(dose, age_group, cumulative_percent_coverage, population)

# Dodged bar chart: percentage coverage per age group, one bar per dose
cumm_vac_age_plot <- cumm_vac_age %>%
  ggplot() +
  aes(x = age_group, y = cumulative_percent_coverage) +
  geom_col(aes(fill = dose), position = "dodge") +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) +
  ggtitle("Percentage Coverage of Vaccination by age group") +
  xlab("Age group") +
  ylab("% of Coverage") +
  color_theme() +
  scale_fill_manual(values = c("#f1a340", "#5ab4ac"))
ggplotly(cumm_vac_age_plot)
Data Preparation
# Keep only Dose 2 rows for ARIMA modelling.
# NOTE(review): this overwrites the trend_vacc_hb data frame used by the
# earlier plots — rerunning those chunks afterwards will fail.
trend_vacc_hb <- trend_vacc_hb %>%
filter (dose == "Dose 2") %>%
select(date,cumulative_number_vaccinated)
# Convert it to zoo type, ordered by date.
# NOTE(review): the format='%m/%d/%Y' argument is ignored if `date` is
# already a Date column — confirm the column's class upstream.
daily_vacc_hb_zoo <- zoo(trend_vacc_hb$cumulative_number_vaccinated,
order.by=as.Date(trend_vacc_hb$date, format='%m/%d/%Y'))
# Convert the zoo object into a timeSeries for modelling
daily_vacc_hb_timeseries <-timeSeries::as.timeSeries(daily_vacc_hb_zoo)
# Plot the original (undifferenced) cumulative vaccination series
original_series<-
autoplot(daily_vacc_hb_timeseries, colour = '#5ab4ac')+
xlab("Month") +
ylab("VACCINATED")+
#scale_x_date(breaks = "1 month", date_labels = "%b - %y" )+
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))+
ggtitle("Original Series") +
# Replaces the y scale autoplot already added — source of the knit warning
scale_y_continuous(labels = scales::unit_format(unit = "M", scale = 1e-6))+
color_theme()
Scale for 'y' is already present. Adding another scale for 'y', which will replace the existing scale.
ggplotly(original_series)
To identify whether the time series is stationary or non-stationary, we can use the ADF (Augmented Dickey-Fuller) test.
# ADF stationarity test on the original vaccination series
adf_test <- adf.test(daily_vacc_hb_timeseries)
The time series is not stationary since we have a high p-value. So we apply difference
# First difference; its ADF p-value (printed below) is still non-significant
first_diff_ts<- diff(daily_vacc_hb_timeseries)
adf_test1 <- adf.test(na.omit(first_diff_ts))
# Second difference; ADF becomes significant, so d = 2 for this series
second_diff_ts<- diff(first_diff_ts)
adf_test2 <- adf.test(na.omit(second_diff_ts))
Warning in adf.test(na.omit(second_diff_ts)) :
p-value smaller than printed p-value
adf_test1
Augmented Dickey-Fuller Test
data: na.omit(first_diff_ts)
Dickey-Fuller = -0.63526, Lag order = 6, p-value = 0.9753
alternative hypothesis: stationary
adf_test2
Augmented Dickey-Fuller Test
data: na.omit(second_diff_ts)
Dickey-Fuller = -6.2431, Lag order = 6, p-value = 0.01
alternative hypothesis: stationary
Create a dataframe to compare
# Side-by-side comparison of ADF statistics and p-values for the original,
# first-differenced and second-differenced series
adf_data <- data.frame(
  Data          = c("Original", "First-Ordered", "Second Ordered"),
  Dickey_Fuller = c(adf_test$statistic,
                    adf_test1$statistic,
                    adf_test2$statistic),
  p_value       = c(adf_test$p.value,
                    adf_test1$p.value,
                    adf_test2$p.value)
)
adf_data
Initially the p-value is high which indicates that the Time Series is not stationary. So we apply difference 2 times. After the second difference, the p-value < significance level (0.05) So we can conclude that the difference data are stationary. So difference (d = 2)
Other method to confirm
# forecast::ndiffs() independently confirms the differencing order (returns 2)
ndiffs(daily_vacc_hb_timeseries)
[1] 2
Let’s plot the First Order and Second Order Difference Series
Order of first difference
# Plot the first-order difference of the vaccination series
first_order<- autoplot(first_diff_ts, ts.colour = '#5ab4ac') +
xlab("Month") +
ylab("VACCINATED")+
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))+
ggtitle("First-Order Difference") +
color_theme()
ggplotly(first_order)
Order of Second difference
# Plot the second-order difference (the stationary series used for d = 2)
second_order<- autoplot(second_diff_ts, ts.colour = '#5ab4ac') +
xlab("Month") +
ylab("VACCINATED")+
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))+
ggtitle("Second-Order Difference") +
color_theme()
ggplotly(second_order)
For our model ARIMA (p,d,q), we found d = 2, the next step is to get the values of p and q, the order of AR and MA part. Plot ACF and PACF charts to identify q and p respectively.
[1] 0.91 0.87 0.83 0.81 0.80 0.80 0.83 0.76 0.72 0.68 0.66 0.65 0.65 0.68 0.60 0.56 0.53 0.52 0.53 0.55 0.57 0.52 0.49
[24] 0.46 0.45 0.45 0.46
[1] -0.30 0.00 -0.11 -0.08 -0.03 -0.19 0.61 -0.18 -0.05 -0.06 -0.09 -0.04 -0.13 0.53 -0.17 -0.06 -0.12 -0.07 -0.08
[20] -0.05 0.47 -0.17 0.01 -0.09 -0.05 -0.07 -0.09
[1] 0.91 0.25 0.06 0.12 0.15 0.13 0.31 -0.47 -0.11 0.08 0.01 0.05 0.07 0.05 -0.25 -0.04 0.04 0.13 0.10
[20] 0.11 0.03 -0.13 0.03 -0.05 -0.06 -0.01 -0.01
[1] -0.30 -0.10 -0.15 -0.18 -0.16 -0.36 0.48 0.12 -0.10 -0.02 -0.06 -0.08 -0.06 0.23 0.03 -0.05 -0.16 -0.12 -0.13
[20] -0.04 0.11 -0.05 0.06 0.07 0.02 0.01 -0.12
The ACF and PACF plots of the differenced data show the following patterns:
The ACF does not follow a sinusoidal pattern but shows a slow geometric decay. Also, there is a significant spike at lag 3 in the PACF, but none beyond lag 3. So the data may follow an AR(3) model.
The PACF is sinusoidal and decaying. Also there is a significant spike at lag 2 in the ACF, but none beyond lag 2 So the data may follow an MA(2) model
So we propose three ARMA models for the differenced data: ARMA(p,q) ARMA(3,2), ARMA(3,0) and ARMA(0,2).
That is, for the original time series, we propose three ARIMA(p,d,q) models: ARIMA(3,1,2), ARIMA(3,1,0) and ARIMA(0,1,2).
# Fit the candidate models proposed from the ACF/PACF analysis.
# Fixed: the third fit duplicated ARIMA(3,1,2); the proposed MA(2)-only
# model is ARIMA(0,1,2) (cf. the matching hospitalisation section).
# NOTE(review): the ADF/ndiffs analysis above suggested d = 2 while these
# fits use d = 1 — confirm which differencing order was intended.
arima_fit1 <- Arima(daily_vacc_hb_timeseries, order = c(3,1,2))
arima_fit2 <- Arima(daily_vacc_hb_timeseries, order = c(3,1,0))
arima_fit3 <- Arima(daily_vacc_hb_timeseries, order = c(0,1,2))
arima_fit4 <- Arima(daily_vacc_hb_timeseries, order = c(3,1,1))
summary(arima_fit1)
Series: daily_vacc_hb_timeseries
ARIMA(3,1,2)
Coefficients:
ar1 ar2 ar3 ma1 ma2
-0.8033 0.7639 0.9315 1.6734 0.8412
s.e. 0.0267 0.0254 0.0252 0.0465 0.0449
sigma^2 estimated as 18863412: log likelihood=-2702.84
AIC=5417.68 AICc=5417.99 BIC=5439.4
Training set error measures:
ME RMSE MAE MPE MAPE MASE ACF1
Training set 433.8442 4295.907 2469.754 0.5542756 1.952307 0.1774188 -0.07419684
summary(arima_fit2)
Series: daily_vacc_hb_timeseries
ARIMA(3,1,0)
Coefficients:
ar1 ar2 ar3
0.6652 0.2214 0.0863
s.e. 0.0598 0.0707 0.0597
sigma^2 estimated as 20641665: log likelihood=-2715.78
AIC=5439.56 AICc=5439.71 BIC=5454.04
Training set error measures:
ME RMSE MAE MPE MAPE MASE ACF1
Training set 387.7177 4510.387 2628.715 0.6106367 2.046611 0.188838 -0.01752897
summary(arima_fit3)
Series: daily_vacc_hb_timeseries
ARIMA(3,1,2)
Coefficients:
ar1 ar2 ar3 ma1 ma2
-0.8033 0.7639 0.9315 1.6734 0.8412
s.e. 0.0267 0.0254 0.0252 0.0465 0.0449
sigma^2 estimated as 18863412: log likelihood=-2702.84
AIC=5417.68 AICc=5417.99 BIC=5439.4
Training set error measures:
ME RMSE MAE MPE MAPE MASE ACF1
Training set 433.8442 4295.907 2469.754 0.5542756 1.952307 0.1774188 -0.07419684
summary(arima_fit4)
Series: daily_vacc_hb_timeseries
ARIMA(3,1,1)
Coefficients:
ar1 ar2 ar3 ma1
1.3482 -0.3143 -0.0387 -0.7437
s.e. 0.1077 0.1081 0.0745 0.0882
sigma^2 estimated as 19748943: log likelihood=-2709.19
AIC=5428.37 AICc=5428.6 BIC=5446.48
Training set error measures:
ME RMSE MAE MPE MAPE MASE ACF1
Training set 284.0437 4403.688 2569.029 0.6859451 1.971568 0.1845503 0.002057572
Forecast the Manual ARIMA model
# Forecast each manually specified model 30 steps ahead
future = forecast(arima_fit1, h = 30)
future2 = forecast(arima_fit2, h = 30)
future3 = forecast(arima_fit3, h = 30)
future4 = forecast(arima_fit4, h = 30)
# Plot the four forecasts in a 2x2 grid
# NOTE(review): par() is changed here without being restored afterwards
par(mfrow = c(2,2))
plot(future)
plot(future2)
plot(future3)
plot(future4)
# Exhaustive automated ARIMA search (non-stepwise, exact likelihood);
# trace = TRUE prints the AICc of every candidate model
auto_arima_fit_vacc <- auto.arima(daily_vacc_hb_timeseries,
seasonal=FALSE,
stepwise = FALSE,
approximation = FALSE,
trace = TRUE
)
ARIMA(0,2,0) : 5442.016
ARIMA(0,2,1) : 5411.092
ARIMA(0,2,2) : 5407.077
ARIMA(0,2,3) : 5403.618
ARIMA(0,2,4) : 5403.786
ARIMA(0,2,5) : 5400.821
ARIMA(1,2,0) : 5419.005
ARIMA(1,2,1) : 5404.292
ARIMA(1,2,2) : 5406.203
ARIMA(1,2,3) : 5404.969
ARIMA(1,2,4) : 5402.265
ARIMA(2,2,0) : 5418.531
ARIMA(2,2,1) : 5406.064
ARIMA(2,2,2) : 5407.433
ARIMA(2,2,3) : 5391.866
ARIMA(3,2,0) : 5414.38
ARIMA(3,2,1) : 5403.501
ARIMA(3,2,2) : 5362.21
ARIMA(4,2,0) : 5407.179
ARIMA(4,2,1) : 5401.006
ARIMA(5,2,0) : 5402.384
Best model: ARIMA(3,2,2)
summary(auto_arima_fit_vacc)
Series: daily_vacc_hb_timeseries
ARIMA(3,2,2)
Coefficients:
ar1 ar2 ar3 ma1 ma2
0.8243 -0.4487 -0.4145 -1.2768 0.9217
s.e. 0.0606 0.0713 0.0574 0.0361 0.0315
sigma^2 estimated as 16635995: log likelihood=-2674.95
AIC=5361.9 AICc=5362.21 BIC=5383.6
Training set error measures:
ME RMSE MAE MPE MAPE MASE ACF1
Training set 16.38254 4026.859 2315.206 0.4405824 1.835199 0.1663165 -0.0592707
Model Selection Criteria :
ARIMA models with minimum AIC, RMSE and MAPE criteria were chosen as the best models. Automated ARIMA confirms that the ARIMA(3, 2, 2) seems good based on AIC
# z-tests on the fitted ARIMA coefficients (significance of each term)
lmtest::coeftest(auto_arima_fit_vacc)
z test of coefficients:
Estimate Std. Error z value Pr(>|z|)
ar1 0.824250 0.060551 13.6124 < 2.2e-16 ***
ar2 -0.448709 0.071338 -6.2899 3.176e-10 ***
ar3 -0.414539 0.057388 -7.2234 5.069e-13 ***
ma1 -1.276757 0.036124 -35.3440 < 2.2e-16 ***
ma2 0.921711 0.031529 29.2335 < 2.2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
All the coefficients are statistically significant.
Let’s plot the diagnostics with the results to make sure the normality and correlation assumptions for the model hold. If the residuals look like white noise, proceed with forecast and prediction, otherwise repeat the model building.
# Residual diagnostics for the chosen model: residual plot, ACF, and Ljung-Box test
res <- checkresiduals(auto_arima_fit_vacc, theme = color_theme())
Ljung-Box test
data: Residuals from ARIMA(3,2,2)
Q* = 103.21, df = 5, p-value < 2.2e-16
Model df: 5. Total lags used: 10
res
Ljung-Box test
data: Residuals from ARIMA(3,2,2)
Q* = 103.21, df = 5, p-value < 2.2e-16
The ACF plot of the residuals from the ARIMA(3,2,2) model shows significant autocorrelations recurring at regular intervals. The portmanteau (Ljung-Box) test returns a very small p-value (close to zero), so we reject the null hypothesis of independence: the residuals are not white noise, and some structure in the data remains uncaptured by the model.
Fitting the ARIMA model with the existing data
The residual errors seem fine with near zero mean and uniform variance. Let’s plot the actuals against the fitted values
# Convert the time series to a data frame for ggplot, adding a row index
# used to join observations against fitted values below
daily_vacc_hb_timeseries_data <- fortify(daily_vacc_hb_timeseries) %>%
  clean_names() %>%
  remove_rownames() %>%
  rename(date = index,
         vacc = data) %>%
  # seq_len() is the safe idiom; seq(1:n) is a redundant double sequence
  mutate(index = seq_len(nrow(daily_vacc_hb_timeseries)))

# Fitted values = observed series minus model residuals
arima_fit_resid <- ts(daily_vacc_hb_timeseries) - resid(auto_arima_fit_vacc)
arima_fit_data <- fortify(arima_fit_resid) %>%
  clean_names() %>%
  mutate(data = round(data, 2))

# Pair each observation with its fitted value by row index
fit_existing_data <- daily_vacc_hb_timeseries_data %>%
  inner_join(arima_fit_data, by = c("index"))
# Plot the observed series (teal) against the ARIMA fitted values (red)
fit_existing_data %>%
ggplot()+
aes(x=date, y = vacc)+
geom_line(color ="#5ab4ac")+
geom_line(aes(x= date, y = data), colour = "red" )+
xlab("Month") +
ylab("Number of People vaccinated")+
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))+
ggtitle("Fitting the ARIMA model with existing data") +
scale_y_continuous(labels = scales::unit_format(unit = "M", scale = 1e-6))+
color_theme()
Data Preparation :
# Forecast 60 days ahead with 80% and 95% prediction intervals, then
# convert the forecast object to a data frame for plotting
forecast_model <- forecast(auto_arima_fit_vacc,level = c(80, 95), h = 60)
forecast_model_data <- fortify(forecast_model) %>%
clean_names() %>%
mutate(data = round(data,2),
fitted= round(fitted,2))
# Forecast horizon: the 60 days immediately after the observed series
forecast_start_date <- as.Date(max(daily_vacc_hb_timeseries_data$date)+1)
forecast_end_date <- as.Date(forecast_start_date+59)
# Future rows only (point forecast + intervals), with real dates attached
forecast_data <- forecast_model_data %>%
filter(!(is.na(point_forecast))) %>%
mutate(date = seq(forecast_start_date,forecast_end_date, by =1)) %>%
select(-data,-fitted, -index)
# Observed rows joined back to their dates and fitted values
fitted_data <- forecast_model_data %>%
filter(!(is.na(data))) %>%
inner_join(daily_vacc_hb_timeseries_data, by = c("index")) %>%
mutate(date = as.Date(date)) %>%
select(date, data, fitted)
Plotting the Vaccination series plus the forecast and 95% prediction intervals
# Positions for the PAST/FUTURE labels on the forecast chart
annotation <- data.frame(
x = c(as.Date("03-04-2021","%d-%m-%Y"),as.Date("31-10-2021","%d-%m-%Y")),
y = c(1000000,3000000),
label = c("PAST", "FUTURE")
)
# Time series plot for the next 60 days: observed (black), fitted (red),
# point forecast with 80% (green) and 95% (grey) prediction ribbons,
# and a dashed vertical line marking the start of the forecast period
fitted_data %>%
ggplot()+
geom_line(aes(x= date, y = data))+
geom_line(aes(x= date, y = fitted), colour = "red" )+
geom_line(aes(x= date, y =point_forecast), data = forecast_data )+
geom_ribbon(aes(x = date, y = point_forecast, ymin = lo_80, ymax = hi_80),
data = forecast_data, alpha = 0.3, fill = "green")+
geom_ribbon(aes(x = date, y = point_forecast, ymin = lo_95, ymax = hi_95),
data = forecast_data, alpha = 0.1)+
ggtitle("Forecast") +
xlab("Month") +
ylab("Number of People vaccinated")+
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))+
color_theme()+
scale_y_continuous(labels = scales::unit_format(unit = "M", scale = 1e-6))+
scale_x_date(breaks = "1 month", date_labels = "%b - %y" )+
geom_text(data=annotation,
aes( x=x, y=y, label=label),
color="red",
size=4 )+
geom_vline(xintercept =as.Date("08-10-2021","%d-%m-%Y"), linetype = 2)
“Forecast on Hospitalisation (ARIMA Modelling)”
Data Preparation
# For forecasting, use only the recent data: Scotland, from June 2021,
# complete cases only
trend_hosp_hb <- trend_hb_daily %>%
  filter(hb_name == "Scotland",
         date >= "2021-06-01",
         !is.na(hospital_admissions)) %>%
  select(date, hospital_admissions)

# Build a date-ordered zoo object, then convert it to a timeSeries
daily_hosp_hb_zoo <- zoo(
  trend_hosp_hb$hospital_admissions,
  order.by = as.Date(trend_hosp_hb$date, format = '%m/%d/%Y')
)
daily_hosp_hb_timeseries <- timeSeries::as.timeSeries(daily_hosp_hb_zoo)
# Plot the original (undifferenced) hospitalisation series.
# NOTE(review): this reuses the name original_series from the vaccination section.
original_series<-autoplot(daily_hosp_hb_timeseries, ts.colour = '#5ab4ac')+
xlab("Month") +
ylab("Number of People hospitalised")+
#scale_x_date(breaks = "1 month", date_labels = "%b - %y" )+
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))+
ggtitle("Trend on Hospitalisation") +
color_theme()
ggplotly(original_series)
Identify whether the time series is stationary / non stationary we can use ADF Augmented Dickey-Fuller test
# ADF stationarity test on the original hospitalisation series
adf_test_hosp <- adf.test(daily_hosp_hb_timeseries)
adf_test_hosp
Augmented Dickey-Fuller Test
data: daily_hosp_hb_timeseries
Dickey-Fuller = -1.0883, Lag order = 4, p-value = 0.9209
alternative hypothesis: stationary
The time series is not stationary since we have a high p-value (p-value must be < 0.05). So we apply difference
# First difference of the hospitalisation series; the ADF test below is
# significant, so d = 1 suffices here
first_diff_hosp<- diff(daily_hosp_hb_timeseries)
adf_test1_hosp <- adf.test(na.omit(first_diff_hosp))
Warning in adf.test(na.omit(first_diff_hosp)) :
p-value smaller than printed p-value
adf_test1_hosp
Augmented Dickey-Fuller Test
data: na.omit(first_diff_hosp)
Dickey-Fuller = -7.5072, Lag order = 4, p-value = 0.01
alternative hypothesis: stationary
Create a dataframe to compare
# Compare ADF results for the original vs first-differenced series
adf_data_hosp <- data.frame(Data = c("Original", "First-Ordered"),
Dickey_Fuller = c(adf_test_hosp$statistic, adf_test1_hosp$statistic),
p_value = c(adf_test_hosp$p.value,adf_test1_hosp$p.value))
adf_data_hosp
Initially the p-value is high which indicates that the Time Series is not stationary. So we apply difference 1 time. After the first difference, the p-value < significance level (0.05) So we can conclude that the difference data are stationary. So difference (d = 1)
Other method:
# forecast::ndiffs() independently confirms d = 1 for the hospitalisation series
ndiffs(daily_hosp_hb_timeseries)
[1] 1
Let’s plot the First Order Difference Series
Order of first difference
# Plot the first-order difference of the hospitalisation series
p<- autoplot(first_diff_hosp, ts.colour = '#5ab4ac') +
xlab("Month") +
ylab("HOSPITALIZATION")+
# scale_x_date(breaks = "1 month", date_labels = "%b - %y" )+
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))+
ggtitle("First-Order Difference Series") +
color_theme()
ggplotly(p)
For our model ARIMA (p,d,q), we found d = 1, the next step is to get the values of p and q, the order of AR and MA part. Plot ACF and PACF charts to identify q and p respectively.
# ACF/PACF of the original and first-differenced series in a 2x2 grid.
# NOTE(review): par() is changed here without being restored afterwards,
# and acf_hosp/pacf_hosp are each overwritten by the second pair of calls.
par(mfrow=c(2,2))
acf_hosp <- acf1(daily_hosp_hb_timeseries, col=2:7, lwd=4)
pacf_hosp <- acf1(daily_hosp_hb_timeseries, pacf = TRUE, col=2:7, lwd=4)
acf_hosp <- acf1(first_diff_hosp, col=2:7, lwd=4)
pacf_hosp <- acf1(first_diff_hosp, pacf = TRUE, col=2:7, lwd=4)
The ACF and PACF plots of the differenced data show the following patterns:
The ACF is sinusoidal and there is a significant spike at lag 3 in the PACF So the data may follow an AR(3) model
The PACF is sinusoidal and there is a significant spike at lag 2 in the ACF So the data may follow an MA(2) model
So we propose three ARMA models for the differenced data: ARMA(p,q) ARMA(3,2), ARMA(3,0) and ARMA(0,2).
That is, for the original time series, we propose three ARIMA(p,d,q) models: ARIMA(3,1,2), ARIMA(3,1,0) and ARIMA(0,1,2).
# Fit the three candidate models suggested by the ACF/PACF analysis:
# ARIMA(3,1,2), the AR-only ARIMA(3,1,0) and the MA-only ARIMA(0,1,2)
arima_fit_hosp_1 = Arima(daily_hosp_hb_timeseries, order = c(3,1,2))
arima_fit_hosp_2 = Arima(daily_hosp_hb_timeseries, order = c(3,1,0))
arima_fit_hosp_3 = Arima(daily_hosp_hb_timeseries, order = c(0,1,2))
summary(arima_fit_hosp_1)
Series: daily_hosp_hb_timeseries
ARIMA(3,1,2)
Coefficients:
ar1 ar2 ar3 ma1 ma2
0.9213 -0.5824 -0.3100 -1.3525 1.0000
s.e. 0.0896 0.1115 0.0903 0.0328 0.0408
sigma^2 estimated as 208.2: log likelihood=-495.31
AIC=1002.61 AICc=1003.35 BIC=1019.39
Training set error measures:
ME RMSE MAE MPE MAPE MASE ACF1
Training set 0.2478801 14.06887 11.2004 -2.586156 17.88154 0.8402037 -0.05606368
summary(arima_fit_hosp_2)
Series: daily_hosp_hb_timeseries
ARIMA(3,1,0)
Coefficients:
ar1 ar2 ar3
-0.2589 -0.2312 -0.1612
s.e. 0.0934 0.0941 0.0931
sigma^2 estimated as 273.9: log likelihood=-509.84
AIC=1027.69 AICc=1028.03 BIC=1038.87
Training set error measures:
ME RMSE MAE MPE MAPE MASE ACF1
Training set 0.4623877 16.27678 12.7506 -2.946581 19.70259 0.9564925 -0.02137951
summary(arima_fit_hosp_3)
Series: daily_hosp_hb_timeseries
ARIMA(0,1,2)
Coefficients:
ma1 ma2
-0.2801 -0.1740
s.e. 0.0898 0.0775
sigma^2 estimated as 270.5: log likelihood=-509.6
AIC=1025.2 AICc=1025.4 BIC=1033.58
Training set error measures:
ME RMSE MAE MPE MAPE MASE ACF1
Training set 0.5474528 16.24428 12.69526 -2.910392 19.77577 0.9523412 0.0004090666
# Side-by-side coefficient comparison of the three manual fits.
# Fixed label: the third model (arima_fit_hosp_3) is ARIMA(0,1,2),
# not ARIMA(0,1,4).
texreg::screenreg(list(arima_fit_hosp_1, arima_fit_hosp_2, arima_fit_hosp_3),
                  custom.model.names = c("ARIMA(3,1,2)", "ARIMA(3,1,0)", "ARIMA(0,1,2)"),
                  center = TRUE,
                  table = FALSE)
========================================================
ARIMA(3,1,2) ARIMA(3,1,0) ARIMA(0,1,4)
--------------------------------------------------------
ar1 0.92 *** -0.26 **
(0.09) (0.09)
ar2 -0.58 *** -0.23 *
(0.11) (0.09)
ar3 -0.31 *** -0.16
(0.09) (0.09)
ma1 -1.35 *** -0.28 **
(0.03) (0.09)
ma2 1.00 *** -0.17 *
(0.04) (0.08)
--------------------------------------------------------
AIC 1002.61 1027.69 1025.20
AICc 1003.35 1028.03 1025.40
BIC 1019.39 1038.87 1033.58
Log Likelihood -495.31 -509.84 -509.60
Num. obs. 121 121 121
========================================================
*** p < 0.001; ** p < 0.01; * p < 0.05
# Automated ARIMA on the lagged hospitalisation series.
# NOTE(review): `lag_value` is not defined anywhere in this file as shown,
# so this call will fail unless it is set elsewhere. The equivalent call
# below uses lag()'s default — confirm the intended lag and define it
# explicitly.
auto_arima_fit_hosp <- auto.arima(lag(daily_hosp_hb_timeseries,k =lag_value ),
seasonal=FALSE,
stepwise=FALSE,
approximation=FALSE,
trace = TRUE
)
ARIMA(0,1,0) : 1017.948
ARIMA(0,1,0) with drift : 1019.919
ARIMA(0,1,1) : 1010.762
ARIMA(0,1,1) with drift : 1012.508
ARIMA(0,1,2) : 1008.339
ARIMA(0,1,2) with drift : 1009.948
ARIMA(0,1,3) : 1010.476
ARIMA(0,1,3) with drift : 1012.124
ARIMA(0,1,4) : 997.2108
ARIMA(0,1,4) with drift : 999.0964
ARIMA(0,1,5) : 992.7933
ARIMA(0,1,5) with drift : 994.8451
ARIMA(1,1,0) : 1014.76
ARIMA(1,1,0) with drift : 1016.692
ARIMA(1,1,1) : 1009.347
ARIMA(1,1,1) with drift : 1010.975
ARIMA(1,1,2) : 1010.478
ARIMA(1,1,2) with drift : 1012.125
ARIMA(1,1,3) : 1011.966
ARIMA(1,1,3) with drift : 1013.657
ARIMA(1,1,4) : Inf
ARIMA(1,1,4) with drift : Inf
ARIMA(2,1,0) : 1011.524
ARIMA(2,1,0) with drift : 1013.37
ARIMA(2,1,1) : 1010.109
ARIMA(2,1,1) with drift : 1011.745
ARIMA(2,1,2) : Inf
ARIMA(2,1,2) with drift : Inf
ARIMA(2,1,3) : Inf
ARIMA(2,1,3) with drift : Inf
ARIMA(3,1,0) : 1012.059
ARIMA(3,1,0) with drift : 1013.866
ARIMA(3,1,1) : 1011.959
ARIMA(3,1,1) with drift : 1013.614
ARIMA(3,1,2) : 984.7138
ARIMA(3,1,2) with drift : 986.4505
ARIMA(4,1,0) : 1012.409
ARIMA(4,1,0) with drift : 1014.144
ARIMA(4,1,1) : 1012.594
ARIMA(4,1,1) with drift : 1014.198
ARIMA(5,1,0) : 1006.976
ARIMA(5,1,0) with drift : 1008.398
Best model: ARIMA(3,1,2)
# Automated ARIMA search on the lagged series (lag() default shift).
# NOTE(review): this repeats the call above and overwrites
# auto_arima_fit_hosp — the two chunks appear redundant.
auto_arima_fit_hosp <- auto.arima(lag(daily_hosp_hb_timeseries),
seasonal=FALSE,
stepwise=FALSE,
approximation=FALSE,
trace = TRUE
)
ARIMA(0,1,0) : 1017.948
ARIMA(0,1,0) with drift : 1019.919
ARIMA(0,1,1) : 1010.762
ARIMA(0,1,1) with drift : 1012.508
ARIMA(0,1,2) : 1008.339
ARIMA(0,1,2) with drift : 1009.948
ARIMA(0,1,3) : 1010.476
ARIMA(0,1,3) with drift : 1012.124
ARIMA(0,1,4) : 997.2108
ARIMA(0,1,4) with drift : 999.0964
ARIMA(0,1,5) : 992.7933
ARIMA(0,1,5) with drift : 994.8451
ARIMA(1,1,0) : 1014.76
ARIMA(1,1,0) with drift : 1016.692
ARIMA(1,1,1) : 1009.347
ARIMA(1,1,1) with drift : 1010.975
ARIMA(1,1,2) : 1010.478
ARIMA(1,1,2) with drift : 1012.125
ARIMA(1,1,3) : 1011.966
ARIMA(1,1,3) with drift : 1013.657
ARIMA(1,1,4) : Inf
ARIMA(1,1,4) with drift : Inf
ARIMA(2,1,0) : 1011.524
ARIMA(2,1,0) with drift : 1013.37
ARIMA(2,1,1) : 1010.109
ARIMA(2,1,1) with drift : 1011.745
ARIMA(2,1,2) : Inf
ARIMA(2,1,2) with drift : Inf
ARIMA(2,1,3) : Inf
ARIMA(2,1,3) with drift : Inf
ARIMA(3,1,0) : 1012.059
ARIMA(3,1,0) with drift : 1013.866
ARIMA(3,1,1) : 1011.959
ARIMA(3,1,1) with drift : 1013.614
ARIMA(3,1,2) : 984.7138
ARIMA(3,1,2) with drift : 986.4505
ARIMA(4,1,0) : 1012.409
ARIMA(4,1,0) with drift : 1014.144
ARIMA(4,1,1) : 1012.594
ARIMA(4,1,1) with drift : 1014.198
ARIMA(5,1,0) : 1006.976
ARIMA(5,1,0) with drift : 1008.398
Best model: ARIMA(3,1,2)
auto_arima_fit_hosp
Series: lag(daily_hosp_hb_timeseries)
ARIMA(3,1,2)
Coefficients:
ar1 ar2 ar3 ma1 ma2
0.942 -0.5638 -0.2542 -1.4378 0.9367
s.e. 0.098 0.1137 0.1001 0.0545 0.0409
sigma^2 estimated as 196.2: log likelihood=-485.99
AIC=983.97 AICc=984.71 BIC=1000.7
Automated ARIMA confirms that the ARIMA(3,1,2) seems good based on AIC
# z-tests on the fitted ARIMA coefficients (significance of each term)
coef<-lmtest::coeftest(auto_arima_fit_hosp)
coef
z test of coefficients:
Estimate Std. Error z value Pr(>|z|)
ar1 0.942010 0.098031 9.6093 < 2.2e-16 ***
ar2 -0.563822 0.113727 -4.9577 7.135e-07 ***
ar3 -0.254174 0.100149 -2.5380 0.01115 *
ma1 -1.437823 0.054505 -26.3795 < 2.2e-16 ***
ma2 0.936722 0.040946 22.8771 < 2.2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
All coefficients are significant except ar3.
ARIMA models with minimum AIC, RMSE and MAPE criteria were chosen as the best models. Based on Akaike Information Criterion (AIC) above, an ARIMA(3, 1, 2) model seems best.
Let’s plot the diagnostics with the results to make sure the normality and correlation assumptions for the model hold. If the residuals look like white noise, proceed with forecast and prediction, otherwise repeat the model building.
# Residual diagnostics for the chosen model: residual plot, ACF, and Ljung-Box test
res <-checkresiduals(auto_arima_fit_hosp, theme = color_theme())
Ljung-Box test
data: Residuals from ARIMA(3,1,2)
Q* = 33.437, df = 5, p-value = 3.082e-06
Model df: 5. Total lags used: 10
res
Ljung-Box test
data: Residuals from ARIMA(3,1,2)
Q* = 33.437, df = 5, p-value = 3.082e-06
The ACF plot of the residuals from the ARIMA(3,1,2) model shows that most autocorrelations lie within the threshold limits. However, the portmanteau (Ljung-Box) test returns a small p-value, so we reject the null hypothesis of independence: the residuals are not fully white noise, and some residual autocorrelation remains.
Fitting the ARIMA model with the existing data
The residual errors seem fine with near zero mean and uniform variance. Let’s plot the actuals against the fitted values
Convert model and time series to dataframe for plotting
# Convert the hospitalisation time series to a data frame, adding a row
# index used to join observations to fitted values
daily_hosp_hb_timeseries_data <- fortify(daily_hosp_hb_timeseries) %>%
  clean_names() %>%
  remove_rownames() %>%
  rename(date = index,
         hosp = data) %>%
  # seq_len() replaces the redundant seq(1:nrow(...)) idiom
  mutate(index = seq_len(nrow(daily_hosp_hb_timeseries)))

# Fitted values = observed series minus model residuals.
# NOTE(review): this overwrites arima_fit_resid/arima_fit_data from the
# vaccination section above.
arima_fit_resid <- ts(daily_hosp_hb_timeseries[1:nrow(daily_hosp_hb_timeseries)]) - resid(auto_arima_fit_hosp)
arima_fit_data <- fortify(arima_fit_resid) %>%
  clean_names() %>%
  mutate(data = round(data, 2))

# Pair each observation with its fitted value by row index
fit_existing_data <- daily_hosp_hb_timeseries_data %>%
  inner_join(arima_fit_data, by = c("index"))
Plotting the series along with the fitted values
# Plot observed hospitalisations (teal) against ARIMA fitted values (red)
fit_existing_hosp_plot <- fit_existing_data %>%
mutate (date = as.Date(date)) %>%
ggplot()+
aes(x=date, y = hosp)+
geom_line(color ="#5ab4ac")+
geom_line(aes(x= date, y = data), colour = "red" )+
xlab("Month") +
ylab("Patient Hospitalised")+
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))+
ggtitle("Fitting the ARIMA model with existing data") +
scale_x_date(breaks = "1 month", date_labels = "%b - %y" )+
#scale_y_continuous(labels = scales::unit_format(unit = "M", scale = 1e-6))+
color_theme()
ggplotly(fit_existing_hosp_plot)
Data Preparation :
# Forecast 30 days ahead with 80% and 95% prediction intervals
forecast_model <- forecast(auto_arima_fit_hosp,level = c(80, 95), h = 30)
# Convert the forecast object to a data frame; clamp negative lower
# interval bounds to zero (admissions cannot be negative)
forecast_model_data <- fortify(forecast_model) %>%
clean_names() %>%
mutate(data = round(data,2),
fitted= round(fitted,2)) %>%
mutate (lo_80 = ifelse(lo_80 < 0,0,lo_80),
lo_95 = ifelse(lo_95 < 0,0,lo_95)
)
# Forecast horizon: the 30 days immediately after the observed series
forecast_start_date <- as.Date(max(daily_hosp_hb_timeseries_data$date)+1)
forecast_end_date <- as.Date(forecast_start_date+29)
# Future rows only, with real dates attached
forecast_data <- forecast_model_data %>%
filter(!(is.na(point_forecast))) %>%
mutate(date = seq(forecast_start_date,forecast_end_date, by =1)) %>%
select(-data,-fitted, -index)
# Observed rows joined back to their dates and fitted values
fitted_data <- forecast_model_data %>%
filter(!(is.na(data))) %>%
inner_join(daily_hosp_hb_timeseries_data, by = c("index")) %>%
mutate(date = as.Date(date)) %>%
select(date, data, fitted)
Plotting the Vaccination series plus the forecast and 80 - 95% prediction intervals
# Positions for the PAST/FUTURE labels on the projection chart
annotation <- data.frame(
x = c(as.Date("13-08-2021","%d-%m-%Y"),as.Date("31-10-2021","%d-%m-%Y")),
y = c(180,200),
label = c("PAST", "FUTURE")
)
# Projection for the next 30 days: observed series (teal), point forecast
# (blue) with 80% (green) and 95% (grey) ribbons, and a dashed vertical
# line marking the end of the observed data
forecast_data_hosp_plot <-fitted_data %>%
ggplot()+
geom_line(aes(x= date, y = data), color = "#5ab4ac")+
#geom_line(aes(x= date, y = fitted), colour = "red" )+
geom_line(aes(x= date, y =point_forecast), color ="blue", size = 0.5,
data = forecast_data )+
geom_ribbon(aes(x = date, y = point_forecast, ymin = lo_80, ymax = hi_80),
data = forecast_data, alpha = 0.3, fill = "green")+
geom_ribbon(aes(x = date, y = point_forecast, ymin = lo_95, ymax = hi_95),
data = forecast_data, alpha = 0.1)+
geom_vline(aes(xintercept=as.numeric(max(date))),color="#f1a340", linetype="dashed",data = fitted_data)+
ggtitle("Projection of Hospitalisation") +
xlab("Month") +
ylab("Patient Hospitalised")+
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))+
color_theme()+
scale_x_date(breaks = "1 month", date_labels = "%b - %y" )+
geom_text(data=annotation,
aes( x=x, y=y, label=label),
color="blue",
size=4 )
ggplotly(forecast_data_hosp_plot)
NA
“Forecast on Deaths (ARIMA Modelling)”
Data Preparation
#For forecasting, we chose the latest data
#For forecasting, we chose the latest data
# Scotland-level daily deaths from 2021-06-01 onwards, with NA deaths dropped
trend_death_hb <- trend_hb_daily %>%
filter (hb_name == "Scotland") %>%
filter(date >="2021-06-01") %>%
filter(!(is.na(daily_deaths))) %>%
select(date, daily_deaths)
# Convert it into a time series (zoo object ordered by date)
# NOTE(review): format='%m/%d/%Y' only applies if date is character — if date
# is already a Date, as.Date() ignores it; confirm against the data source
daily_death_hb_zoo <- zoo(trend_death_hb$daily_deaths,
order.by=as.Date(trend_death_hb$date, format='%m/%d/%Y'))
# Convert it into a time series
daily_death_hb_timeseries <- timeSeries::as.timeSeries(daily_death_hb_zoo)
# Plot the raw (undifferenced) death series
original_series_death<-autoplot(daily_death_hb_timeseries, ts.colour = '#5ab4ac')+
xlab("Month") +
ylab("Patient died")+
#scale_x_date(breaks = "1 month", date_labels = "%b - %y" )+
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))+
ggtitle("Trend on Deaths") +
color_theme()
ggplotly(original_series_death)
To identify whether the time series is stationary or non-stationary, we can use the ADF (Augmented Dickey-Fuller) test
# Augmented Dickey-Fuller test on the raw death series (H1: stationary)
adf_test_death <- adf.test(daily_death_hb_timeseries)
adf_test_death
Augmented Dickey-Fuller Test
data: daily_death_hb_timeseries
Dickey-Fuller = -1.4701, Lag order = 4, p-value = 0.7967
alternative hypothesis: stationary
The time series is not stationary, since we have a high p-value (the p-value must be < 0.05 to conclude stationarity). So we apply differencing
# First-order differencing, then re-test stationarity (drop the leading NA)
first_diff_death<- diff(daily_death_hb_timeseries)
adf_test1_death <- adf.test(na.omit(first_diff_death))
Warning in adf.test(na.omit(first_diff_death)) :
p-value smaller than printed p-value
# Print the ADF result for the differenced series
adf_test1_death
Augmented Dickey-Fuller Test
data: na.omit(first_diff_death)
Dickey-Fuller = -5.5546, Lag order = 4, p-value = 0.01
alternative hypothesis: stationary
Create a dataframe to compare
# Side-by-side ADF statistics and p-values, before and after differencing
adf_data_death <- data.frame(Data = c("Original", "First-Ordered"),
Dickey_Fuller = c(adf_test_death$statistic, adf_test1_death$statistic),
p_value = c(adf_test_death$p.value,adf_test1_death$p.value))
adf_data_death
Initially the p-value is high which indicates that the Time Series is not stationary. So we apply difference 1 time. After the first difference, the p-value < significance level (0.05) So we can conclude that the difference data are stationary. So difference (d = 1)
Other method:
# ndiffs() estimates the number of differences needed for stationarity
ndiffs(daily_death_hb_timeseries)
[1] 1
Let’s plot the First Order Difference Series
Order of first difference
# Recompute the first difference and plot it to visually confirm stationarity
first_diff_death<- diff(daily_death_hb_timeseries)
p<- autoplot(first_diff_death, ts.colour = '#5ab4ac') +
xlab("Month") +
ylab("DEATH")+
# scale_x_date(breaks = "1 month", date_labels = "%b - %y" )+
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))+
ggtitle("First-Order Difference Series") +
color_theme()
ggplotly(p)
For our model ARIMA (p,d,q), we found d = 1, the next step is to get the values of p and q, the order of AR and MA part. Plot ACF and PACF charts to identify q and p respectively.
# Arrange a 2x2 plot grid, then draw the ACF (to pick q) and PACF (to pick p)
# of the differenced series
par(mfrow=c(2,2))
acf_death <- acf1(first_diff_death, col=2:7, lwd=4)
pacf_death <- acf1(first_diff_death, pacf = TRUE, col=2:7, lwd=4)
The ACF and PACF plots both taper to zero at some point. So the data follow an ARMA model
The PACF non-zero value at lag 3 So the data may follow an AR(3) model
The ACF non - zero value at lag 2 So it can be MA (2)
So we propose three ARMA models for the differenced data: ARMA(p,q) ARMA(3,2), ARMA(3,0) and ARMA(0,2).
That is, for the original time series, we propose three ARIMA(p,d,q) models: ARIMA(3,1,2), ARIMA(3,1,0) and ARIMA(0,1,2).
# Fit the three candidate models proposed from the ACF/PACF analysis:
# ARIMA(3,1,2), ARIMA(3,1,0) and ARIMA(0,1,2).
# (Assignment changed from `=` to the idiomatic `<-`; behavior is identical.)
arima_fit_dth_1 <- Arima(daily_death_hb_timeseries, order = c(3,1,2))
arima_fit_dth_2 <- Arima(daily_death_hb_timeseries, order = c(3,1,0))
arima_fit_dth_3 <- Arima(daily_death_hb_timeseries, order = c(0,1,2))
# Coefficients, AIC/AICc/BIC and training-set error measures
summary(arima_fit_dth_1)
Series: daily_death_hb_timeseries
ARIMA(3,1,2)
Coefficients:
ar1 ar2 ar3 ma1 ma2
0.9171 0.2154 -0.3663 -1.7837 0.9143
s.e. 0.1133 0.1232 0.1185 0.0652 0.0721
sigma^2 estimated as 8.088: log likelihood=-296.46
AIC=604.93 AICc=605.67 BIC=621.7
Training set error measures:
ME RMSE MAE MPE MAPE MASE ACF1
Training set 0.2407185 2.773069 2.025545 -Inf Inf 0.7208557 -0.0291094
# Summary of the ARIMA(3,1,0) candidate
summary(arima_fit_dth_2)
Series: daily_death_hb_timeseries
ARIMA(3,1,0)
Coefficients:
ar1 ar2 ar3
-0.8014 -0.2776 -0.091
s.e. 0.0913 0.1137 0.093
sigma^2 estimated as 8.593: log likelihood=-300.63
AIC=609.27 AICc=609.61 BIC=620.45
Training set error measures:
ME RMSE MAE MPE MAPE MASE ACF1
Training set 0.3192471 2.882911 2.161774 -Inf Inf 0.7693371 -0.04370216
# Summary of the ARIMA(0,1,2) candidate
summary(arima_fit_dth_3)
Series: daily_death_hb_timeseries
ARIMA(0,1,2)
Coefficients:
ma1 ma2
-0.8169 0.2285
s.e. 0.0882 0.0773
sigma^2 estimated as 8.548: log likelihood=-300.84
AIC=607.69 AICc=607.89 BIC=616.08
Training set error measures:
ME RMSE MAE MPE MAPE MASE ACF1
Training set 0.3544154 2.887548 2.145374 -Inf Inf 0.7635008 -0.04100283
Another way of checking AIC
# Side-by-side comparison table of the three candidates (coefficients,
# significance stars, and AIC/AICc/BIC)
texreg::screenreg(list(arima_fit_dth_1, arima_fit_dth_2, arima_fit_dth_3),
custom.model.names =c("ARIMA(3,1,2)","ARIMA(3,1,0)","ARIMA(0,1,2)"),
center = TRUE,
table = FALSE)
========================================================
ARIMA(3,1,2) ARIMA(3,1,0) ARIMA(0,1,2)
--------------------------------------------------------
ar1 0.92 *** -0.80 ***
(0.11) (0.09)
ar2 0.22 -0.28 *
(0.12) (0.11)
ar3 -0.37 ** -0.09
(0.12) (0.09)
ma1 -1.78 *** -0.82 ***
(0.07) (0.09)
ma2 0.91 *** 0.23 **
(0.07) (0.08)
--------------------------------------------------------
AIC 604.93 609.27 607.69
AICc 605.67 609.61 607.89
BIC 621.70 620.45 616.08
Log Likelihood -296.46 -300.63 -300.84
Num. obs. 121 121 121
========================================================
*** p < 0.001; ** p < 0.01; * p < 0.05
Based on this, the ARIMA(3,1,2) model seems best. We can cross-check this using an automated method
# Automated ARIMA order search (non-seasonal, exhaustive: stepwise and
# approximation disabled; trace prints every model's AICc).
# NOTE(review): the series is wrapped in lag() here, so the model is fitted on
# a shifted copy of the death series, whereas the manual Arima() fits above use
# the unshifted series — confirm whether lag() is intentional.
auto_arima_fit_death <- auto.arima(lag(daily_death_hb_timeseries),
seasonal=FALSE,
stepwise=FALSE,
approximation=FALSE,
trace = TRUE
)
ARIMA(0,1,0) : 661.2495
ARIMA(0,1,0) with drift : 662.9805
ARIMA(0,1,1) : 604.9241
ARIMA(0,1,1) with drift : 603.4592
ARIMA(0,1,2) : 600.5927
ARIMA(0,1,2) with drift : 600.1606
ARIMA(0,1,3) : 602.1154
ARIMA(0,1,3) with drift : 601.3862
ARIMA(0,1,4) : 597.6822
ARIMA(0,1,4) with drift : 597.7631
ARIMA(0,1,5) : 599.8704
ARIMA(0,1,5) with drift : 599.9091
ARIMA(1,1,0) : 605.4993
ARIMA(1,1,0) with drift : 606.5658
ARIMA(1,1,1) : 599.2604
ARIMA(1,1,1) with drift : 598.7763
ARIMA(1,1,2) : 601.2259
ARIMA(1,1,2) with drift : 600.6498
ARIMA(1,1,3) : 603.1702
ARIMA(1,1,3) with drift : 602.3916
ARIMA(1,1,4) : 599.8811
ARIMA(1,1,4) with drift : 599.963
ARIMA(2,1,0) : 601.7416
ARIMA(2,1,0) with drift : 602.2924
ARIMA(2,1,1) : 601.1992
ARIMA(2,1,1) with drift : 600.5597
ARIMA(2,1,2) : 603.3692
ARIMA(2,1,2) with drift : 602.7661
ARIMA(2,1,3) : 592.0755
ARIMA(2,1,3) with drift : Inf
ARIMA(3,1,0) : 603.1214
ARIMA(3,1,0) with drift : 603.4379
ARIMA(3,1,1) : 603.3196
ARIMA(3,1,1) with drift : 602.7205
ARIMA(3,1,2) : 599.3322
ARIMA(3,1,2) with drift : 600.2376
ARIMA(4,1,0) : 596.0037
ARIMA(4,1,0) with drift : 594.9402
ARIMA(4,1,1) : 594.7462
ARIMA(4,1,1) with drift : 594.1175
ARIMA(5,1,0) : 593.284
ARIMA(5,1,0) with drift : 593.3211
Best model: ARIMA(2,1,3)
# Print the model chosen by the automated search
auto_arima_fit_death
Series: lag(daily_death_hb_timeseries)
ARIMA(2,1,3)
Coefficients:
ar1 ar2 ma1 ma2 ma3
-1.6080 -0.9445 0.8866 0.0474 -0.4563
s.e. 0.0583 0.0518 0.0987 0.1133 0.0984
sigma^2 estimated as 7.525: log likelihood=-289.67
AIC=591.33 AICc=592.08 BIC=608.06
However, the automated ARIMA search instead selects ARIMA(2,1,3) as the best model based on AIC
# z-tests for the significance of each estimated ARIMA coefficient
coef_dth<-lmtest::coeftest(auto_arima_fit_death)
coef_dth
z test of coefficients:
Estimate Std. Error z value Pr(>|z|)
ar1 -1.608022 0.058276 -27.5932 < 2.2e-16 ***
ar2 -0.944495 0.051822 -18.2257 < 2.2e-16 ***
ma1 0.886589 0.098691 8.9835 < 2.2e-16 ***
ma2 0.047364 0.113311 0.4180 0.6759
ma3 -0.456307 0.098377 -4.6384 3.512e-06 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Model Selection Criteria :
ARIMA models with minimum AIC, RMSE and MAPE criteria were chosen as the best models. Based on Akaike Information Criterion (AIC) above, an ARIMA(2, 1, 3) model seems best.
Let’s plot the diagnostics with the results to make sure the normality and correlation assumptions for the model hold. If the residuals look like white noise, proceed with forecast and prediction, otherwise repeat the model building.
# Residual diagnostics: residual plot, ACF, and Ljung-Box portmanteau test
res_dth <-checkresiduals(auto_arima_fit_death, theme = color_theme())
Ljung-Box test
data: Residuals from ARIMA(2,1,3)
Q* = 6.9535, df = 5, p-value = 0.2241
Model df: 5. Total lags used: 10
# Print the Ljung-Box test result
res_dth
Ljung-Box test
data: Residuals from ARIMA(2,1,3)
Q* = 6.9535, df = 5, p-value = 0.2241
The ACF plot of the residuals from the ARIMA(2,1,3) model shows that all autocorrelations are within the threshold limits except one. However, the portmanteau test (Ljung-Box test) gives a high p-value, which means the residuals are independent, i.e. we fail to reject the null hypothesis
Fitting the ARIMA model with the existing data
Let’s plot the actuals against the fitted values
Convert model and time series to dataframe for plotting
# Convert the observed death time series to a data frame, tidy the column
# names, and add a sequential row index used later to join fitted values on.
# (seq(1:n) replaced with the idiomatic, equivalent seq_len(n).)
daily_death_hb_timeseries_data <- fortify(daily_death_hb_timeseries) %>%
clean_names() %>%
remove_rownames %>%
rename (date = index,
death = data)%>%
mutate(index = seq_len(nrow(daily_death_hb_timeseries)))
# In-sample fitted values = observed series minus model residuals.
# NOTE(review): the residuals come from a model fitted on
# lag(daily_death_hb_timeseries) but are subtracted from the unlagged
# series — verify the alignment is intended.
arima_fit_dth_resid <- ts(daily_death_hb_timeseries[1:nrow(daily_death_hb_timeseries)]) - resid(auto_arima_fit_death)
# Round for plotting and floor negative fitted values at 0
arima_fit_dth_data <- fortify(arima_fit_dth_resid) %>%
clean_names() %>%
mutate(data = round(data,2))%>%
mutate (data = ifelse(data < 0,0,data))
# Join the fitted values back onto the observed data by row index
fit_existing_dth_data <- daily_death_hb_timeseries_data %>%
inner_join(arima_fit_dth_data, by = c("index"))
Plotting the series along with the fitted values
# Overlay the observed death series (teal) with the ARIMA fitted values (red);
# negative values are floored at 0 before plotting
fit_existing_dth_plot <- fit_existing_dth_data %>%
mutate (data = ifelse(data < 0,0,data)) %>%
mutate(date = as.Date(date)) %>%
ggplot()+
aes(x=date, y = death)+
geom_line(color ="#5ab4ac")+
# fitted values from the ARIMA model
geom_line(aes(x= date, y = data), colour = "red" )+
xlab("Month") +
ylab("Deaths reported")+
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))+
scale_x_date(breaks = "1 month", date_labels = "%b - %y" )+
ggtitle("Fitting the ARIMA model with existing data") +
color_theme()
# Render the plot interactively
ggplotly(fit_existing_dth_plot)
NA
Data Preparation :
# Forecast the next 30 days of deaths with 80% and 95% intervals
forecast_dth_model <- forecast(auto_arima_fit_death,level = c(80, 95), h = 30)
#Convert the model to dataframe for plotting
# Negative values of the CI interval are considered as 0
forecast_dth_model_data <- fortify(forecast_dth_model) %>%
clean_names() %>%
mutate(data = round(data,2),
fitted= round(fitted,2)) %>%
mutate (lo_80 = ifelse(lo_80 < 0,0,lo_80),
lo_95 = ifelse(lo_95 < 0,0,lo_95)
)
# Forecast horizon starts the day after the last observed date, for 30 days
forecast_start_date <- as.Date(max(daily_death_hb_timeseries_data$date)+1)
forecast_end_date <- as.Date(forecast_start_date+29)
# Future rows only (those with a point forecast), given real calendar dates
forecast_dth_data <- forecast_dth_model_data %>%
filter(!(is.na(point_forecast))) %>%
mutate(date = seq(forecast_start_date,forecast_end_date, by =1)) %>%
select(-data,-fitted, -index)
# Historical rows only (observed data plus in-sample fitted values)
fitted_dth_data <- forecast_dth_model_data %>%
filter(!(is.na(data))) %>%
inner_join(daily_death_hb_timeseries_data, by = c("index")) %>%
mutate(date = as.Date(date)) %>%
select(date, data, fitted)
Plotting the death series plus the forecast and 80–95% prediction intervals
# Positions for the PAST / FUTURE labels either side of the forecast cut-off
annotation <- data.frame(
x = c(as.Date("13-06-2021","%d-%m-%Y"),as.Date("11-10-2021","%d-%m-%Y")),
y = c(30,40),
label = c("PAST", "FUTURE")
)
# Time series plot for the next 30 days (h = 30 above) according to the best
# ARIMA model, with 80% and 95% prediction intervals
forecast_data_dth_plot <- fitted_dth_data %>%
ggplot()+
# observed history
geom_line(aes(x= date, y = data), color = "#5ab4ac")+
#geom_line(aes(x= date, y = fitted), colour = "red" )+
# point forecast over the future window
geom_line(aes(x= date, y =point_forecast), color ="blue", data = forecast_dth_data )+
# 80% interval (green), 95% interval (grey, more transparent)
geom_ribbon(aes(x = date, y = point_forecast, ymin = lo_80, ymax = hi_80),
data = forecast_dth_data, alpha = 0.3, fill = "green")+
geom_ribbon(aes(x = date, y = point_forecast, ymin = lo_95, ymax = hi_95),
data = forecast_dth_data, alpha = 0.1)+
# dashed vertical line marks the boundary between observed and forecast
geom_vline(aes(xintercept=as.numeric(max(date))),color="#f1a340", linetype="dashed",data = fitted_dth_data)+
ggtitle("Projection of new Deaths") +
xlab("Month") +
ylab("Death reported")+
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))+
color_theme()+
scale_x_date(breaks = "1 month", date_labels = "%b - %y" )+
geom_text(data=annotation,
aes( x=x, y=y, label=label),
color="blue",
size=4 )
ggplotly(forecast_data_dth_plot )